Step 1:

## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE
cv_states <- merge(cv_states, state_pops, by= "state")

Step 2. Look at the data

dim(cv_states)
## [1] 52894     9
head(cv_states)
##     state       date fips   cases deaths geo_id population pop_density abb
## 1 Alabama 2022-10-13    1 1528739  20505      1    4887871    96.50939  AL
## 2 Alabama 2021-07-08    1  552911  11387      1    4887871    96.50939  AL
## 3 Alabama 2021-09-24    1  784484  13921      1    4887871    96.50939  AL
## 4 Alabama 2022-03-16    1 1290165  18944      1    4887871    96.50939  AL
## 5 Alabama 2022-11-07    1 1534287  20558      1    4887871    96.50939  AL
## 6 Alabama 2020-08-02    1   91444   1627      1    4887871    96.50939  AL
tail(cv_states)
##         state       date fips  cases deaths geo_id population pop_density abb
## 52889 Wyoming 2021-05-24   56  59797    713     56     577737    5.950611  WY
## 52890 Wyoming 2022-11-16   56 179838   1924     56     577737    5.950611  WY
## 52891 Wyoming 2022-05-04   56 156745   1814     56     577737    5.950611  WY
## 52892 Wyoming 2021-07-17   56  63523    760     56     577737    5.950611  WY
## 52893 Wyoming 2022-04-27   56 156550   1812     56     577737    5.950611  WY
## 52894 Wyoming 2022-06-18   56 162160   1824     56     577737    5.950611  WY
str(cv_states)
## 'data.frame':    52894 obs. of  9 variables:
##  $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ date       : IDate, format: "2022-10-13" "2021-07-08" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  1528739 552911 784484 1290165 1534287 91444 843161 836255 587405 1531305 ...
##  $ deaths     : int  20505 11387 13921 18944 20558 1627 16073 15834 11536 20533 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : chr  "AL" "AL" "AL" "AL" ...

3. Format the data

# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
### FINISH THE CODE HERE 
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame':    52894 obs. of  9 variables:
##  $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date       : Date, format: "2020-03-13" "2020-03-14" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
##  $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
##       state       date fips cases deaths geo_id population pop_density abb
## 664 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
## 15  Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
## 400 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
## 89  Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
## 505 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
## 234 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
tail(cv_states)
##         state       date fips  cases deaths geo_id population pop_density abb
## 52121 Wyoming 2022-12-08   56 181695   1939     56     577737    5.950611  WY
## 52704 Wyoming 2022-12-09   56 181695   1939     56     577737    5.950611  WY
## 52108 Wyoming 2022-12-10   56 181695   1939     56     577737    5.950611  WY
## 52654 Wyoming 2022-12-11   56 181695   1939     56     577737    5.950611  WY
## 52467 Wyoming 2022-12-12   56 181695   1939     56     577737    5.950611  WY
## 52765 Wyoming 2022-12-13   56 182198   1951     56     577737    5.950611  WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
##       state       date fips cases deaths geo_id population pop_density abb
## 664 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
## 15  Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
## 400 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
## 89  Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
## 505 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
## 234 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
summary(cv_states)
##            state            date                 fips           cases         
##  Washington   : 1058   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
##  Illinois     : 1055   1st Qu.:2020-11-11   1st Qu.:16.00   1st Qu.:   96570  
##  California   : 1054   Median :2021-07-23   Median :29.00   Median :  357800  
##  Arizona      : 1053   Mean   :2021-07-22   Mean   :29.78   Mean   :  848487  
##  Massachusetts: 1047   3rd Qu.:2022-04-03   3rd Qu.:44.00   3rd Qu.: 1003619  
##  Wisconsin    : 1043   Max.   :2022-12-13   Max.   :72.00   Max.   :11630402  
##  (Other)      :46584                                                          
##      deaths           geo_id        population        pop_density       
##  Min.   :     0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
##  1st Qu.:  1410   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
##  Median :  5279   Median :29.00   Median : 4468402   Median :  107.860  
##  Mean   : 11680   Mean   :29.78   Mean   : 6402373   Mean   :  422.968  
##  3rd Qu.: 14699   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
##  Max.   :100252   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
##                                                      NA's   :1006       
##       abb       
##  WA     : 1058  
##  IL     : 1055  
##  CA     : 1054  
##  AZ     : 1053  
##  MA     : 1047  
##  WI     : 1043  
##  (Other):46584
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-12-13"

4. Add new_cases and new_deaths and correct outliers

# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  cv_subset = cv_subset[order(cv_subset$date),]
  # add starting level for new cases and deaths
  cv_subset$new_cases = cv_subset$cases[1]
  cv_subset$new_deaths = cv_subset$deaths[1]
  ### FINISH THE CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
  }
  # include in main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  # add starting level for new cases and deaths
  cv_subset$cases = cv_subset$cases[1]
  cv_subset$deaths = cv_subset$deaths[1]
  ### FINISH CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$new_cases[j-1]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$new_deaths[j-1]
  }
  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
#p2=NULL

5. Add additional variables

### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k =  as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k =  as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k =  as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k =  as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))

6. Explore scatterplots using plot_ly()

### FINISH CODE HERE
# pop_density vs. cases
cv_states_today %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper100k
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
          hoverinfo = 'text',
          text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") , 
                         paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
                  yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")

7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()

### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point() + geom_smooth()
ggplotly(p)

8. Multiple line chart

### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>% add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")